# -*- coding: utf-8 -*-
# @Time    : 2019/12/20 14:21
# @Author  : Damn7Kx
# @Software: PyCharm
import datetime
import json
import scrapy
from NewsSpider.items import NewsItem
from NewsSpider.tools.utils import Utils
from NewsSpider.tools.filter_time import Times
from NewsSpider.tools.redis_db import Redis_DB
from w3lib.html import remove_tags


class MiErJunShiNews(scrapy.Spider):
    """
    Mier Military (米尔军事) app spider.

    Hits a fixed API endpoint with POST form-data requests: one batch of
    list-page requests per (channel, page) pair, then a separate request
    per article for its detail page.
    """

    name = 'Mier'

    t = Times()                # publish-time parsing / recency-filter helper
    redis = Redis_DB()         # dedup store (key space "wenzhangquchong")
    types = list(range(1, 6))  # channel ids 1..5

    # NOTE(review): only a single User-Agent is configured.
    headers = {
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; G011A Build/LMY48Z; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Safari/537.36",
        "Connection": "close",
    }

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'NewsSpider.middlewares.ProxyIPMiddleware': 544,
        },
        'ITEM_PIPELINES': {
            'NewsSpider.pipelines.KafkaPipeline': 544,
        }
    }

    def start_requests(self):
        """Issue one list-page POST per (channel, page): 5 channels x 10 pages."""
        url = 'http://api.wap.junshijia.com/api/apps/index.php?'
        for channel in self.types:
            for page in range(1, 11):
                params = {
                    'apiVersion': 'v1',
                    'controller': 'News',
                    'channel': str(channel),
                    'page': str(page),
                    'action': 'newslist',
                }
                yield scrapy.FormRequest(
                    url,
                    headers=self.headers,
                    callback=self.parse_text,
                    formdata=params,
                    dont_filter=True,
                )

    def parse_text(self, response):
        """Parse a list page and request the detail page of each recent article."""
        print("正在访问列表页:", response.url)
        if not response.text:
            return
        datas = json.loads(response.text)
        for entry in datas['data']['newsLists']:
            # Relative time string from the API -> absolute datetime string.
            pubdate = str(self.t.datetimes(entry['timeAgo']))
            # Skip stale articles; do not push `None` into the Scrapy engine
            # (the original `yield None` was a no-op at best).
            if not self.t.time_is_Recent(pubdate):
                continue
            params = {
                'apiVersion': 'v1',
                'controller': 'Article',
                'action': 'details',
                'aid': str(entry['id']),
            }
            yield scrapy.FormRequest(
                'http://api.wap.junshijia.com/api/apps/index.php?',
                headers=self.headers,
                callback=self.parse,
                formdata=params,
                dont_filter=True,
                # Carry the list-page fields the detail parser needs.
                meta={'pubdate': pubdate, 'title': entry['title']},
            )

    def parse(self, response):
        """Parse an article detail page into a NewsItem, deduplicating via Redis."""
        datas = json.loads(response.text)
        data = datas['data']
        if not data:
            return
        url = data['shareUrl']
        # Renamed from `id` to avoid shadowing the builtin.
        article_id = Utils.url_hash(url)
        # check_exist_2(...) == 0 is treated as "already seen" by the
        # original code -- TODO confirm against Redis_DB semantics.
        if self.redis.check_exist_2("wenzhangquchong", article_id, '') == 0:
            print('该id:%s已存在' % article_id)
            return
        body = data['webContent']
        item = NewsItem()
        item['id'] = article_id
        item['url'] = url
        item['title'] = response.meta['title']
        item['pubdate'] = response.meta['pubdate']
        item['content'] = remove_tags(body)
        item['author'] = data['authorNickName']
        item['formats'] = "app"
        item['dataSource'] = "米尔军事"
        item['serchEnType'] = "米尔军事"
        item['html'] = body
        item['updateTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['collectProcess'] = 'crawl_news'
        item['serverIp'] = "113.128.12.74"
        yield item
